Setup

n<-nrow(subset_selected)
set.seed(123)
n_s = sample(n,10000)
train = subset_selected[n_s, ]
crossValidationCont = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
    pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year)
}
crossValidationCont2 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  predi = vector(mode="list", length=3)
  pri = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine, weights = 1/(traine$year)^2)
    pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a, weights = 1/(holdoute$year)^2)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
    predi[[k]] = pred
    pri[[k]] = holdoute$price
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
crossValidationCont3 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  predi = vector(mode="list", length=3)
  pri = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(I(log(price))~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
    pred1 = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
    pred = exp(pred1)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
    predi[[k]] = pred
    pri[[k]] = holdoute$price
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
intervalScore = function(predObj,actual,level) { 
n = nrow(predObj)
alpha = 1-level
ilow = (actual<predObj[,2]) # overestimation
ihigh = (actual>predObj[,3]) # underestimation
sumlength = sum(predObj[,3]-predObj[,2]) # sum of lengths of prediction intervals 
sumlow = sum(predObj[ilow,2]-actual[ilow])*2/alpha
sumhigh = sum(actual[ihigh]-predObj[ihigh,3])*2/alpha
avglength = sumlength/n
IS = (sumlength+sumlow+sumhigh)/n # average length + average under/over penalties 
cover = mean(actual>= predObj[,2] & actual<=predObj[,3])
summ = c(level,avglength,IS,cover)
# summary with level, average length, interval score, coverage rate, r^2, rmse
imiss = which(ilow | ihigh)
list(summary=summ, imiss=imiss)
}
ols_step_best_subset(lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type+gdpc,data=train))
                         Best Subsets Regression                          
--------------------------------------------------------------------------
Model Index    Predictors
--------------------------------------------------------------------------
     1         year                                                        
     2         year car_type                                               
     3         year mileage_cat car_type                                   
     4         year mileage_cat fuel car_type                              
     5         mark_cat year mileage_cat fuel car_type                     
     6         mark_cat year mileage_cat log_vol_engine fuel car_type      
     7         mark_cat year mileage_cat log_vol_engine fuel car_type gdpc 
--------------------------------------------------------------------------

                                                                  Subsets Regression Summary                                                                  
--------------------------------------------------------------------------------------------------------------------------------------------------------------
                       Adj.        Pred                                                                                                                        
Model    R-Square    R-Square    R-Square       C(p)           AIC           SBIC            SBC            MSEP             FPE             HSP         APC  
--------------------------------------------------------------------------------------------------------------------------------------------------------------
  1        0.4331      0.4330      0.4327    11826.1632    213461.9378    185080.4132    213483.5688    1.091154e+12    109137238.4019    10914.8160    0.5672 
  2        0.6354      0.6352      0.6348     4041.5913    209056.6401    180669.9789    209107.1124    701879354586.3881     70230066.4790     7023.7099    0.3649 
  3        0.7074      0.7071      0.7067     1268.9664    206864.1012    178470.4096    206950.6253    563184727342.0479     56380481.1251     5638.6131    0.2928 
  4        0.7245      0.7241      0.7235      614.7308    206269.6787    177872.3313    206377.8338    530418063646.8668     53116154.1273     5312.1482    0.2758 
  5        0.7337      0.7333      0.7327      261.4335    205934.9011    177533.8372    206064.6872    512698419504.8883     51357128.5591     5136.2286    0.2666 
  6        0.7407      0.7402      0.7393       -3.6592    205672.8528    177272.1069    205809.8493    499387849825.0785     50028809.8706     5003.3841    0.2597 
  7        0.7407      0.7402      0.7393       -3.0000    205673.5096    177272.7693    205817.7164    499370749041.4341     50032100.9365     5003.7141    0.2597 
--------------------------------------------------------------------------------------------------------------------------------------------------------------
AIC: Akaike Information Criteria 
 SBIC: Sawa's Bayesian Information Criteria 
 SBC: Schwarz Bayesian Criteria 
 MSEP: Estimated error of prediction, assuming multivariate normality 
 FPE: Final Prediction Error 
 HSP: Hocking's Sp 
 APC: Amemiya Prediction Criteria 
reg_model= crossValidationCont(train,3,0.5,6)
weighted_model = crossValidationCont2(train,3,0.5,6)
transformed_model = crossValidationCont3(train,3,0.8,6)
d = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train)
plot(d$fitted.values,d$residuals, ylab="Residuals", xlab = "Fitted Values", main = "OLS Residuals")

plot(train$year,d$residuals, ylab="Residuals", xlab = "Year",main = "Year against OLS Residuals")

de = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train, weights = 1/(train$year)^2)
plot(de$fitted.values,de$residuals, ylab="Residuals", xlab = "Fitted Values",main = "WLS Residuals")

plot(reg_model$models[[1]]$fitted.values,reg_model$models[[1]]$residuals)

plot(reg_model$models[[2]]$fitted.values,reg_model$models[[2]]$residuals)

plot(reg_model$models[[3]]$fitted.values,reg_model$models[[3]]$residuals)

plot(reg_model$year[[1]],reg_model$models[[1]]$residuals)

plot(reg_model$year[[2]],reg_model$models[[2]]$residuals)

plot(reg_model$year[[3]],reg_model$models[[3]]$residuals)

plot(weighted_model$models[[1]]$fitted.values,weighted_model$models[[1]]$residuals)

plot(weighted_model$models[[2]]$fitted.values,weighted_model$models[[2]]$residuals)

plot(weighted_model$models[[3]]$fitted.values,weighted_model$models[[3]]$residuals)

plot(transformed_model$models[[1]]$fitted.values,transformed_model$models[[1]]$residuals, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")

plot(transformed_model$models[[2]]$fitted.values,transformed_model$models[[2]]$residuals, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")

plot(transformed_model$models[[3]]$fitted.values,transformed_model$models[[3]]$residuals, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")

res1 = transformed_model$price[[1]]  - exp(transformed_model$models[[1]]$fitted.values)
longer object length is not a multiple of shorter object length
res2 = transformed_model$price[[2]]  - exp(transformed_model$models[[2]]$fitted.values)
longer object length is not a multiple of shorter object length
res3 = transformed_model$price[[3]]  - exp(transformed_model$models[[3]]$fitted.values)
longer object length is not a multiple of shorter object length
plot(exp(transformed_model$models[[1]]$fitted.values),res1, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")

plot(exp(transformed_model$models[[2]]$fitted.values),res2, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")

plot(exp(transformed_model$models[[3]]$fitted.values),res3, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")

---
title: "R Notebook"
output: html_notebook
---
# Setup
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(mlbench)
library(randomForest)
library(party)
library(rpart)
library(rpart.plot)
library(leaps)
library(olsrr)
library(magrittr)
library(tibble)
```

```{r}
n<-nrow(subset_selected)
set.seed(123)
n_s = sample(n,10000)
train = subset_selected[n_s, ]
```


```{r}
crossValidationCont = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
    pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year)
}
```

```{r}
crossValidationCont2 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  predi = vector(mode="list", length=3)
  pri = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine, weights = 1/(traine$year)^2)
    pred = predict(models[[k]],newdata=holdoute,interval="prediction",level=a, weights = 1/(holdoute$year)^2)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
    predi[[k]] = pred
    pri[[k]] = holdoute$price
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
```

```{r}
crossValidationCont3 = function(df,K,a,nperfmeas=6)
{ set.seed(123)
  n = nrow(df)
  nhold = round(n/K) # size of holdout set 
  iperm = sample(n)
  perfmeas = matrix(0,K,nperfmeas)  
  models = vector(mode="list", length=3)
  year = vector(mode="list", length=3)
  predi = vector(mode="list", length=3)
  pri = vector(mode="list", length=3)
  for(k in 1:K)
  { indices = (((k-1)*nhold+1):(k*nhold))
    if( k==K ) indices = (((k-1)*nhold+1):n)
    indices = iperm[indices]
    traine = df[-indices,]
    holdoute = df[indices,]
    models[[k]] = lm(I(log(price))~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=traine)
    pred1 = predict(models[[k]],newdata=holdoute,interval="prediction",level=a)
    pred = exp(pred1)
    RSS = c(crossprod(models[[k]]$residuals))
    MSE = RSS / length(models[[k]]$residuals)
    x = intervalScore(pred, holdoute$price, a)
    y= sqrt(MSE)
    z = summary(models[[k]])$r.squared
    perfmeas[k,1:4] = x$summary
    perfmeas[k,5] = y
    perfmeas[k,6] = z
    year[[k]] = traine$year
    predi[[k]] = pred
    pri[[k]] = holdoute$price
  }
  avgperfmeas = apply(perfmeas,2,mean)
  list(perfmeasbyfold=perfmeas, avgperfmeas=avgperfmeas, models= models, year= year, pred = predi, price = pri)
}
```

```{r}
intervalScore = function(predObj,actual,level) { 
n = nrow(predObj)
alpha = 1-level
ilow = (actual<predObj[,2]) # overestimation
ihigh = (actual>predObj[,3]) # underestimation
sumlength = sum(predObj[,3]-predObj[,2]) # sum of lengths of prediction intervals 
sumlow = sum(predObj[ilow,2]-actual[ilow])*2/alpha
sumhigh = sum(actual[ihigh]-predObj[ihigh,3])*2/alpha
avglength = sumlength/n
IS = (sumlength+sumlow+sumhigh)/n # average length + average under/over penalties 
cover = mean(actual>= predObj[,2] & actual<=predObj[,3])
summ = c(level,avglength,IS,cover)
# summary with level, average length, interval score, coverage rate, r^2, rmse
imiss = which(ilow | ihigh)
list(summary=summ, imiss=imiss)
}
ols_step_best_subset(lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type+gdpc,data=train))
reg_model= crossValidationCont(train,3,0.5,6)
weighted_model = crossValidationCont2(train,3,0.5,6)
transformed_model = crossValidationCont3(train,3,0.8,6)
```

```{r}
d = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train)
plot(d$fitted.values,d$residuals, ylab="Residuals", xlab = "Fitted Values", main = "OLS Residuals")
plot(train$year,d$residuals, ylab="Residuals", xlab = "Year",main = "Year against OLS Residuals")
de = lm(price~mark_cat+year+mileage_cat+log_vol_engine+fuel+car_type,data=train, weights = 1/(train$year)^2)
plot(de$fitted.values,de$residuals, ylab="Residuals", xlab = "Fitted Values",main = "WLS Residuals")
```


```{r}
plot(reg_model$models[[1]]$fitted.values,reg_model$models[[1]]$residuals)
plot(reg_model$models[[2]]$fitted.values,reg_model$models[[2]]$residuals)
plot(reg_model$models[[3]]$fitted.values,reg_model$models[[3]]$residuals)
```

```{r}
plot(reg_model$year[[1]],reg_model$models[[1]]$residuals)
plot(reg_model$year[[2]],reg_model$models[[2]]$residuals)
plot(reg_model$year[[3]],reg_model$models[[3]]$residuals)
```

```{r}
plot(weighted_model$models[[1]]$fitted.values,weighted_model$models[[1]]$residuals)
plot(weighted_model$models[[2]]$fitted.values,weighted_model$models[[2]]$residuals)
plot(weighted_model$models[[3]]$fitted.values,weighted_model$models[[3]]$residuals)
```
```{r}
plot(transformed_model$models[[1]]$fitted.values,transformed_model$models[[1]]$residuals, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")
plot(transformed_model$models[[2]]$fitted.values,transformed_model$models[[2]]$residuals, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")
plot(transformed_model$models[[3]]$fitted.values,transformed_model$models[[3]]$residuals, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")
```

```{r}
res1 = transformed_model$price[[1]]  - exp(transformed_model$models[[1]]$fitted.values)
res2 = transformed_model$price[[2]]  - exp(transformed_model$models[[2]]$fitted.values)
res3 = transformed_model$price[[3]]  - exp(transformed_model$models[[3]]$fitted.values)
plot(exp(transformed_model$models[[1]]$fitted.values),res1, main = "Fold 1", xlab = "Fitted Values", ylab = "Residuals")
plot(exp(transformed_model$models[[2]]$fitted.values),res2, main = "Fold 2", xlab = "Fitted Values", ylab = "Residuals")
plot(exp(transformed_model$models[[3]]$fitted.values),res3, main = "Fold 3", xlab = "Fitted Values", ylab = "Residuals")
```